import os
from sklearn.model_selection import train_test_split
import pandas as pd
class load_data():
    """Load a labelled e-mail corpus from an index file and split it into CSVs.

    Each non-blank line of the index file is read as whitespace-separated
    fields: the first field is the label and the second is the path of the
    e-mail file.
    """

    def read_label_path(self, index_file_path):
        """Read the labels and e-mail file paths listed in the index file.

        Args:
            index_file_path: Path of the index file.

        Returns:
            A ``(labels, paths)`` tuple of two parallel lists of strings.

        Raises:
            IndexError: If a non-blank line has fewer than two fields.
        """
        labels, paths = [], []
        with open(index_file_path) as file:
            # Iterate the file lazily instead of loading every line up front.
            for index_line in file:
                fields = index_line.split()
                if not fields:
                    # Blank lines (e.g. a trailing empty line) hold no record.
                    continue
                # Index both fields before appending so the two lists can
                # never get out of step on a malformed line.
                label, path = fields[0], fields[1]
                labels.append(label)
                paths.append(path)
        return labels, paths

    def read_email(self, index_file_path):
        """Read the content of every e-mail referenced by the index file.

        Paths are opened exactly as written in the index, so relative paths
        are resolved against the current working directory.

        Args:
            index_file_path: Path of the index file.

        Returns:
            A ``(labels, contents)`` tuple of two parallel lists of strings.
        """
        labels, paths = self.read_label_path(index_file_path)
        contents = []
        for path in paths:
            # Files are decoded as GBK; undecodable bytes are dropped
            # (errors="ignore") rather than raising.
            with open(path, encoding='gbk', errors="ignore") as file:
                contents.append(file.read())
        return labels, contents

    def save_data(self, index_file_path,
                  output_dir='D:/workspace/email_check/data'):
        """Split the corpus 80/20 into train and test sets and save them as CSV.

        Writes ``01初始训练集.csv`` (training set) and ``02初始测试集.csv``
        (test set) into ``output_dir``. Each file has an ``x`` column with the
        e-mail text and a ``y`` column with the label.

        Args:
            index_file_path: Path of the index file.
            output_dir: Directory that receives the two CSV files. Defaults to
                the location that was previously hard-coded.
        """
        labels, contents = self.read_email(index_file_path)
        # Fixed random_state keeps the split reproducible between runs.
        x_train, x_test, y_train, y_test = train_test_split(
            contents, labels, test_size=0.2, random_state=22)
        train_df = pd.DataFrame({'x': x_train, 'y': y_train})
        test_df = pd.DataFrame({'x': x_test, 'y': y_test})
        # NOTE(review): chdir changes the working directory of the whole
        # process and is kept only because existing callers may rely on it.
        # Consider writing to os.path.join(output_dir, name) instead.
        os.chdir(output_dir)
        train_df.to_csv('01初始训练集.csv')
        test_df.to_csv('02初始测试集.csv')




# Example usage:
# os.chdir('D:/workspace/email_check/data/trec06c/full')
# print(os.getcwd())
# a = load_data()
# a.save_data('index')





